{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "pip install -U pyarrow --quiet"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "3XudvZulCKaU",
        "outputId": "c023ae50-6e61-4d88-ca9f-db1877c5063f"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Requirement already satisfied: pyarrow in /usr/local/lib/python3.10/dist-packages (17.0.0)\n",
            "Requirement already satisfied: numpy>=1.16.6 in /usr/local/lib/python3.10/dist-packages (from pyarrow) (1.26.4)\n"
          ]
        }
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "PxDn3GQoADYZ",
        "outputId": "83d04b64-d2e2-4a8b-b7b9-1689f3e99ee2"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "\u001b[?25l   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/84.1 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.1/84.1 kB\u001b[0m \u001b[31m4.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25h"
          ]
        }
      ],
      "source": [
        "pip install datasets transformers torch seqeval evaluate  --quiet"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "from datasets import load_dataset\n",
        "from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer\n",
        "import numpy as np\n",
        "import evaluate\n",
        "\n",
        "# Load the IMDB dataset\n",
        "dataset = load_dataset(\"imdb\")\n",
        "\n",
        "# Initialize tokenizer and model\n",
        "model_name = \"bert-base-uncased\"\n",
        "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
        "model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)\n",
        "\n",
        "# Tokenization function\n",
        "def tokenize_function(examples):\n",
        "    return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True, max_length=128)\n",
        "\n",
        "# Tokenize datasets\n",
        "tokenized_datasets = dataset.map(tokenize_function, batched=True)\n",
        "\n",
        "# Rename the label column to 'labels' as expected by the model\n",
        "tokenized_datasets = tokenized_datasets.rename_column(\"label\", \"labels\")\n",
        "\n",
        "# Set format for PyTorch\n",
        "tokenized_datasets.set_format(\"torch\", columns=[\"input_ids\", \"attention_mask\", \"labels\"])\n",
        "\n",
        "# Create a smaller subset for demonstration purposes\n",
        "small_train_dataset = tokenized_datasets[\"train\"].shuffle(seed=42).select(range(5000))\n",
        "small_eval_dataset = tokenized_datasets[\"test\"].shuffle(seed=42).select(range(1000))\n",
        "\n",
        "# Define training arguments\n",
        "training_args = TrainingArguments(\n",
        "    output_dir=\"./results\",\n",
        "    num_train_epochs=3,\n",
        "    per_device_train_batch_size=16,\n",
        "    per_device_eval_batch_size=16,\n",
        "    warmup_steps=500,\n",
        "    weight_decay=0.01,\n",
        "    logging_dir=\"./logs\",\n",
        "    logging_steps=1000,\n",
        "    save_steps=1000,\n",
        "    eval_strategy=\"epoch\",\n",
        "    save_strategy=\"epoch\",\n",
        "    load_best_model_at_end=True,\n",
        "    metric_for_best_model=\"accuracy\",\n",
        ")\n",
        "\n",
        "# Load the accuracy metric\n",
        "metric = evaluate.load(\"accuracy\")\n",
        "\n",
        "# Define compute metrics function\n",
        "def compute_metrics(eval_pred):\n",
        "    logits, labels = eval_pred\n",
        "    predictions = np.argmax(logits, axis=-1)\n",
        "    return metric.compute(predictions=predictions, references=labels)\n",
        "\n",
        "# Initialize Trainer\n",
        "trainer = Trainer(\n",
        "    model=model,\n",
        "    args=training_args,\n",
        "    train_dataset=small_train_dataset,\n",
        "    eval_dataset=small_eval_dataset,\n",
        "    tokenizer=tokenizer,\n",
        "    compute_metrics=compute_metrics\n",
        ")\n",
        "\n",
        "# Evaluate the model\n",
        "eval_results = trainer.evaluate()\n",
        "print(f\"Evaluation results: {eval_results}\")\n",
        "\n",
        "# Train the model\n",
        "trainer.train()\n",
        "\n",
        "# Evaluate the model\n",
        "eval_results = trainer.evaluate()\n",
        "print(f\"Evaluation results: {eval_results}\")\n",
        "\n",
        "# Save the model\n",
        "trainer.save_model(\"./fine_tuned_bert_imdb\")\n",
        "\n",
        "# Function to predict sentiment\n",
        "def predict_sentiment(text):\n",
        "    # Determine the device\n",
        "    device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
        "\n",
        "    # Move the model to the appropriate device\n",
        "    model.to(device)\n",
        "\n",
        "    # Tokenize and move to the same device as the model\n",
        "    inputs = tokenizer(text, return_tensors=\"pt\", padding=True, truncation=True, max_length=128)\n",
        "    inputs = {k: v.to(device) for k, v in inputs.items()}\n",
        "\n",
        "    model.eval()  # Set the model to evaluation mode\n",
        "    with torch.no_grad():\n",
        "        outputs = model(**inputs)\n",
        "\n",
        "    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)\n",
        "    predicted_class = torch.argmax(probabilities, dim=-1).item()\n",
        "    return \"Positive\" if predicted_class == 1 else \"Negative\"\n",
        "\n",
        "# Test the model with a new sentence\n",
        "test_sentence = \"This movie was absolutely fantastic! I loved every minute of it.\"\n",
        "print(f\"Sentiment: {predict_sentiment(test_sentence)}\")\n",
        "\n"
      ],
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 316
        },
        "id": "zDR98U-gAwF1",
        "outputId": "cdb83d98-acd8-4010-c8de-56a3a765fe8e"
      },
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": [
            "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']\n",
            "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ],
            "text/html": [
              "\n",
              "    <div>\n",
              "      \n",
              "      <progress value='126' max='63' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
              "      [63/63 02:04]\n",
              "    </div>\n",
              "    "
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Evaluation results: {'eval_loss': 0.7108113765716553, 'eval_accuracy': 0.513, 'eval_runtime': 6.6601, 'eval_samples_per_second': 150.148, 'eval_steps_per_second': 9.459}\n"
          ]
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ],
            "text/html": [
              "\n",
              "    <div>\n",
              "      \n",
              "      <progress value='939' max='939' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
              "      [939/939 07:50, Epoch 3/3]\n",
              "    </div>\n",
              "    <table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              " <tr style=\"text-align: left;\">\n",
              "      <th>Epoch</th>\n",
              "      <th>Training Loss</th>\n",
              "      <th>Validation Loss</th>\n",
              "      <th>Accuracy</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <td>1</td>\n",
              "      <td>No log</td>\n",
              "      <td>0.388412</td>\n",
              "      <td>0.844000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>2</td>\n",
              "      <td>No log</td>\n",
              "      <td>0.387161</td>\n",
              "      <td>0.848000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>3</td>\n",
              "      <td>No log</td>\n",
              "      <td>0.497519</td>\n",
              "      <td>0.857000</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table><p>"
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "display_data",
          "data": {
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ],
            "text/html": [
              "\n",
              "    <div>\n",
              "      \n",
              "      <progress value='63' max='63' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
              "      [63/63 00:06]\n",
              "    </div>\n",
              "    "
            ]
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "Evaluation results: {'eval_loss': 0.49751895666122437, 'eval_accuracy': 0.857, 'eval_runtime': 7.0677, 'eval_samples_per_second': 141.489, 'eval_steps_per_second': 8.914, 'epoch': 3.0}\n",
            "Sentiment: Positive\n"
          ]
        }
      ]
    }
  ]
}